In [1]:
# Cargamos librerías
import pandas as pd 
import numpy as np
from pickle import TRUE

# para modelos
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px

# manejo de datatime
import calendar
import datetime
from datetime import timedelta, datetime
from enum import IntEnum

# acceso a sistema
import sys
import os 

# GLOBAL_VARIABLES
from platform import python_version
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, "../data_dsmarket")

In [2]:
print("Working with these versions of libraries\n")
print(f"Numpy version {np.__version__}")
print(f"Pandas version {pd.__version__}")
print(f"Python version {python_version()}")

print("\nDirectorio actual\n",CWD)
print("\nDirectorio con conjunto de datos\n",DATA_PATH)

Working with these versions of libraries

Numpy version 1.24.1
Pandas version 1.5.3
Python version 3.8.16

Directorio actual
 /Users/nono/Downloads/nuclio/TFM/fase02_clusters

Directorio con conjunto de datos
 /Users/nono/Downloads/nuclio/TFM/fase02_clusters/../data_dsmarket


In [3]:
# Funciones

def datosGenerales(df):
    # Vemos un shape
    print("Su shape",df.shape);

    # vemos número de duplicados
    print("El número de duplicados es ",\
          df.duplicated().sum());
    
    # vemos recuento de nulos y si los hay, vemos su porcentaje
    return df.isnull().sum().apply(lambda row:
                                    f"{row} nulos, {round(row / df.shape[0]*100,2)} %"
                                       if row != 0 else f"{row} nulos ________")
    
def cambioFormato(dataset, atributos, tipo):
    dataset[atributos] = dataset[atributos].astype(tipo)

We upload the 3 datasets:

*   **df_sales** indicates the sales divided by day and location
*   **df_prices** indicates the products, its cathegory and the price that will depend on the week in which the product has been sold
*   **df_events** is a calendar that indicates the days, the week and the possible events

In [4]:
%%time
# Cargamos los ficheros csv

df_events = pd.read_csv(os.path.join(DATA_PATH, 'daily_calendar_with_events_new.csv'), sep=';')
df_prices = pd.read_csv(os.path.join(DATA_PATH,'item_prices.csv'), sep=',')
df_sales = pd.read_csv(os.path.join(DATA_PATH,'item_sales.csv'), sep=',')

CPU times: user 4.97 s, sys: 481 ms, total: 5.45 s
Wall time: 5.46 s


### Trabajamos con los Eventos

In [5]:
datosGenerales(df_events)

Su shape (1913, 7)
El número de duplicados es  0


date                0 nulos ________
mes                 0 nulos ________
weekday             0 nulos ________
weekday_int         0 nulos ________
d                   0 nulos ________
event            1887 nulos, 98.64 %
festivos_eeuu    1813 nulos, 94.77 %
dtype: object

In [6]:
df_events.head(15)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu
0,29/1/2011,1,Saturday,1,d_1,,
1,30/1/2011,1,Sunday,2,d_2,,
2,31/1/2011,1,Monday,3,d_3,,
3,1/2/2011,2,Tuesday,4,d_4,,
4,2/2/2011,2,Wednesday,5,d_5,,
5,3/2/2011,2,Thursday,6,d_6,,
6,4/2/2011,2,Friday,7,d_7,,
7,5/2/2011,2,Saturday,1,d_8,,
8,6/2/2011,2,Sunday,2,d_9,SuperBowl,SuperBowl
9,7/2/2011,2,Monday,3,d_10,,


In [7]:
# Convertimos a datetime 'date'
df_events['date'] = pd.to_datetime(df_events['date'], format="%d/%m/%Y")

In [8]:
# Imputamos 'No event' a los nulos
df_events.fillna('No event', inplace=True)
df_events['event'].value_counts()

No event          1887
SuperBowl            6
Ramadan starts       5
Thanksgiving         5
NewYear              5
Easter               5
Name: event, dtype: int64

In [9]:
df_events.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1913 entries, 0 to 1912
Columns: 7 entries, date to festivos_eeuu
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 104.7+ KB


In [10]:
#reviso con el head porque me aparecian otros meses en febrero, como que habia un cambio en el formato cuando agregué
#festivo_eeuu en el csv. Agregué el parámetro "format" para cambiar a formato date
df_events.head(5)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu
0,2011-01-29,1,Saturday,1,d_1,No event,No event
1,2011-01-30,1,Sunday,2,d_2,No event,No event
2,2011-01-31,1,Monday,3,d_3,No event,No event
3,2011-02-01,2,Tuesday,4,d_4,No event,No event
4,2011-02-02,2,Wednesday,5,d_5,No event,No event


In [11]:
df_events['year']=df_events['date'].dt.isocalendar().year

In [12]:
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1913 entries, 0 to 1912
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1913 non-null   datetime64[ns]
 1   mes            1913 non-null   int64         
 2   weekday        1913 non-null   object        
 3   weekday_int    1913 non-null   int64         
 4   d              1913 non-null   object        
 5   event          1913 non-null   object        
 6   festivos_eeuu  1913 non-null   object        
 7   year           1913 non-null   UInt32        
dtypes: UInt32(1), datetime64[ns](1), int64(2), object(4)
memory usage: 114.1+ KB


In [13]:
# armo funcion para tener csv por año
def tabla_año(df,año,columna):
    evento_año=df[df[columna]==año]
  
    # Escribimos el nuevo DataFrame a un archivo csv
    filename = DATA_PATH + '/' + str(año) + '.csv'
    evento_año.to_csv(filename)   

In [14]:
%%time
tabla_año(df_events,2011,'year')
tabla_año(df_events,2012,'year')
tabla_año(df_events,2013,'year')
tabla_año(df_events,2014,'year')
tabla_año(df_events,2015,'year')
tabla_año(df_events,2016,'year')

CPU times: user 14.3 ms, sys: 2.73 ms, total: 17 ms
Wall time: 16.9 ms


#### Eventos 2011

In [15]:
df_events_2011 = pd.read_csv(f'{DATA_PATH}/2011.csv',sep=",")

In [16]:
df_events_2011.drop('Unnamed: 0',axis=1,inplace=True)
df_events_2011['date'] = pd.to_datetime(df_events_2011['date'])

In [17]:
#vuelvo a agregarle dos para q semana del año comience en sabado
df_events_2011['date_false']=df_events_2011['date']+ timedelta(days=2)

In [18]:
df_events_2011.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,2011-01-31
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,2011-02-01
2,2011-01-31,1,Monday,3,d_3,No event,No event,2011,2011-02-02
3,2011-02-01,2,Tuesday,4,d_4,No event,No event,2011,2011-02-03
4,2011-02-02,2,Wednesday,5,d_5,No event,No event,2011,2011-02-04
5,2011-02-03,2,Thursday,6,d_6,No event,No event,2011,2011-02-05
6,2011-02-04,2,Friday,7,d_7,No event,No event,2011,2011-02-06
7,2011-02-05,2,Saturday,1,d_8,No event,No event,2011,2011-02-07
8,2011-02-06,2,Sunday,2,d_9,SuperBowl,SuperBowl,2011,2011-02-08
9,2011-02-07,2,Monday,3,d_10,No event,No event,2011,2011-02-09


In [19]:
# obtenemos el año
df_events_2011['year'] = df_events_2011['date'].dt.isocalendar().year
df_events_2011['year_false'] = df_events_2011['date_false'].dt.isocalendar().year

# aplico isocalendar para obtener la week de cada año
df_events_2011['week'] = df_events_2011['date'].dt.isocalendar().week
df_events_2011['week_false'] = df_events_2011['date_false'].dt.isocalendar().week

In [20]:
df_events_2011.head(15)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,2011-01-31,2011,4,5
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,2011-02-01,2011,4,5
2,2011-01-31,1,Monday,3,d_3,No event,No event,2011,2011-02-02,2011,5,5
3,2011-02-01,2,Tuesday,4,d_4,No event,No event,2011,2011-02-03,2011,5,5
4,2011-02-02,2,Wednesday,5,d_5,No event,No event,2011,2011-02-04,2011,5,5
5,2011-02-03,2,Thursday,6,d_6,No event,No event,2011,2011-02-05,2011,5,5
6,2011-02-04,2,Friday,7,d_7,No event,No event,2011,2011-02-06,2011,5,5
7,2011-02-05,2,Saturday,1,d_8,No event,No event,2011,2011-02-07,2011,5,6
8,2011-02-06,2,Sunday,2,d_9,SuperBowl,SuperBowl,2011,2011-02-08,2011,5,6
9,2011-02-07,2,Monday,3,d_10,No event,No event,2011,2011-02-09,2011,6,6


In [21]:
df_events_2011.tail()

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
333,2011-12-28,12,Wednesday,5,d_334,No event,No event,2011,2011-12-30,2011,52,52
334,2011-12-29,12,Thursday,6,d_335,No event,No event,2011,2011-12-31,2011,52,52
335,2011-12-30,12,Friday,7,d_336,No event,No event,2011,2012-01-01,2011,52,52
336,2011-12-31,12,Saturday,1,d_337,No event,No event,2011,2012-01-02,2012,52,1
337,2012-01-01,1,Sunday,2,d_338,NewYear,NewYear,2011,2012-01-03,2012,52,1


In [22]:
#la semana 1 que hay en este df es del año 2012, porque 31/12/11 y 1/1/12 es semana 1 del 2012
df_events_2011[['date']].where((df_events_2011['week_false']==1) & (df_events_2011['year_false']==2012 )).value_counts()

date      
2011-12-31    1
2012-01-01    1
dtype: int64

In [23]:
#la unica semana sin 7 dias es la w1 del 2012
((df_events_2011.groupby(['year','week_false'])['date'].count())==7).value_counts()

True     48
False     1
Name: date, dtype: int64

In [24]:
#esta week_false=1 es la del 2012
df_events_2011.pivot_table(index='week_false', values='date', aggfunc=[len]).head()

Unnamed: 0_level_0,len
Unnamed: 0_level_1,date
week_false,Unnamed: 1_level_2
1,2
5,7
6,7
7,7
8,7


In [25]:
#chequeo la ultima semana
df_events_2011.tail()

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
333,2011-12-28,12,Wednesday,5,d_334,No event,No event,2011,2011-12-30,2011,52,52
334,2011-12-29,12,Thursday,6,d_335,No event,No event,2011,2011-12-31,2011,52,52
335,2011-12-30,12,Friday,7,d_336,No event,No event,2011,2012-01-01,2011,52,52
336,2011-12-31,12,Saturday,1,d_337,No event,No event,2011,2012-01-02,2012,52,1
337,2012-01-01,1,Sunday,2,d_338,NewYear,NewYear,2011,2012-01-03,2012,52,1


#### Eventos 2012

In [26]:
df_events_2012 = pd.read_csv(f'{DATA_PATH}/2012.csv',sep=",")

In [27]:
#armo funcion porque voy a repetir lo mismo para cada año
def calendario_años(df):


    df.drop('Unnamed: 0',axis=1,inplace=True)
    df['date'] = pd.to_datetime(df['date'])
    df['date_false']=df['date']+ timedelta(days=2)

# obtenemos el año
    df['year'] = df['date'].dt.isocalendar().year
    df['year_false'] = df['date_false'].dt.isocalendar().year

# aplico isocalendar para obtener la week de cada año

    df['week'] = df['date'].dt.isocalendar().week
    df['week_false'] = df['date_false'].dt.isocalendar().week
   
    
    print(((df.groupby(['year','week_false'])['date'].count())>7).value_counts())

In [28]:
calendario_años(df_events_2012)

False    52
Name: date, dtype: int64


In [29]:
#del 31/12/11 al 6/1/2012 es w1 del 2012
df_events_2012.head(20)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2012-01-02,1,Monday,3,d_339,No event,No event,2012,2012-01-04,2012,1,1
1,2012-01-03,1,Tuesday,4,d_340,No event,No event,2012,2012-01-05,2012,1,1
2,2012-01-04,1,Wednesday,5,d_341,No event,No event,2012,2012-01-06,2012,1,1
3,2012-01-05,1,Thursday,6,d_342,No event,No event,2012,2012-01-07,2012,1,1
4,2012-01-06,1,Friday,7,d_343,No event,No event,2012,2012-01-08,2012,1,1
5,2012-01-07,1,Saturday,1,d_344,No event,No event,2012,2012-01-09,2012,1,2
6,2012-01-08,1,Sunday,2,d_345,No event,No event,2012,2012-01-10,2012,1,2
7,2012-01-09,1,Monday,3,d_346,No event,No event,2012,2012-01-11,2012,2,2
8,2012-01-10,1,Tuesday,4,d_347,No event,No event,2012,2012-01-12,2012,2,2
9,2012-01-11,1,Wednesday,5,d_348,No event,No event,2012,2012-01-13,2012,2,2


In [30]:
#reviso fin de año
df_events_2012.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
354,2012-12-21,12,Friday,7,d_693,No event,No event,2012,2012-12-23,2012,51,51
355,2012-12-22,12,Saturday,1,d_694,No event,No event,2012,2012-12-24,2012,51,52
356,2012-12-23,12,Sunday,2,d_695,No event,No event,2012,2012-12-25,2012,51,52
357,2012-12-24,12,Monday,3,d_696,No event,No event,2012,2012-12-26,2012,52,52
358,2012-12-25,12,Tuesday,4,d_697,No event,Navidad,2012,2012-12-27,2012,52,52
359,2012-12-26,12,Wednesday,5,d_698,No event,No event,2012,2012-12-28,2012,52,52
360,2012-12-27,12,Thursday,6,d_699,No event,No event,2012,2012-12-29,2012,52,52
361,2012-12-28,12,Friday,7,d_700,No event,No event,2012,2012-12-30,2012,52,52
362,2012-12-29,12,Saturday,1,d_701,No event,No event,2012,2012-12-31,2013,52,1
363,2012-12-30,12,Sunday,2,d_702,No event,No event,2012,2013-01-01,2013,52,1


In [31]:
#veo 52 y en price hay 53, es porque del 29/12/12 al 4/1/13 es w53 del 2012
((df_events_2012.groupby(['year','week_false'])['date'].count())>7).value_counts()

False    52
Name: date, dtype: int64

In [32]:
#cambiamos 29/12 y 30/12 que seria semana 53 del 2012
df_events_2012.loc[362,'week_false'] = int(53)
df_events_2012.loc[362,'year_false'] = int(2012)

df_events_2012.loc[363,'week_false'] = int(53)
df_events_2012.loc[363,'year_false'] = int(2012)

In [33]:
df_events_2012.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
354,2012-12-21,12,Friday,7,d_693,No event,No event,2012,2012-12-23,2012,51,51
355,2012-12-22,12,Saturday,1,d_694,No event,No event,2012,2012-12-24,2012,51,52
356,2012-12-23,12,Sunday,2,d_695,No event,No event,2012,2012-12-25,2012,51,52
357,2012-12-24,12,Monday,3,d_696,No event,No event,2012,2012-12-26,2012,52,52
358,2012-12-25,12,Tuesday,4,d_697,No event,Navidad,2012,2012-12-27,2012,52,52
359,2012-12-26,12,Wednesday,5,d_698,No event,No event,2012,2012-12-28,2012,52,52
360,2012-12-27,12,Thursday,6,d_699,No event,No event,2012,2012-12-29,2012,52,52
361,2012-12-28,12,Friday,7,d_700,No event,No event,2012,2012-12-30,2012,52,52
362,2012-12-29,12,Saturday,1,d_701,No event,No event,2012,2012-12-31,2012,52,53
363,2012-12-30,12,Sunday,2,d_702,No event,No event,2012,2013-01-01,2012,52,53


#### Eventos 2013

In [34]:
df_events_2013 = pd.read_csv(f'{DATA_PATH}/2013.csv',sep=",")

In [35]:
calendario_años(df_events_2013)

False    52
Name: date, dtype: int64


In [36]:
#del 29/12/12 al 4/1/13 es w53 del 2012- HAY QUE CORREGIR
df_events_2013.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2012-12-31,12,Monday,3,d_703,No event,No event,2013,2013-01-02,2013,1,1
1,2013-01-01,1,Tuesday,4,d_704,NewYear,NewYear,2013,2013-01-03,2013,1,1
2,2013-01-02,1,Wednesday,5,d_705,No event,No event,2013,2013-01-04,2013,1,1
3,2013-01-03,1,Thursday,6,d_706,No event,No event,2013,2013-01-05,2013,1,1
4,2013-01-04,1,Friday,7,d_707,No event,No event,2013,2013-01-06,2013,1,1
5,2013-01-05,1,Saturday,1,d_708,No event,No event,2013,2013-01-07,2013,1,2
6,2013-01-06,1,Sunday,2,d_709,No event,No event,2013,2013-01-08,2013,1,2
7,2013-01-07,1,Monday,3,d_710,No event,No event,2013,2013-01-09,2013,2,2
8,2013-01-08,1,Tuesday,4,d_711,No event,No event,2013,2013-01-10,2013,2,2
9,2013-01-09,1,Wednesday,5,d_712,No event,No event,2013,2013-01-11,2013,2,2


In [37]:
df_events_2013['week_false']=df_events_2013['week_false'].astype('int')
df_events_2013['week_false']=df_events_2013['week_false']-1 #como la q figura 1, en realidad es la 53 del año anterior
#las siguientes se corren, entoncse tengo q vovler a poner en 1 desde el sabado 5/1, POR ESO RESTO 1
df_events_2013['week_false']=df_events_2013['week_false'].astype('str')

In [38]:
df_events_2013.head()

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2012-12-31,12,Monday,3,d_703,No event,No event,2013,2013-01-02,2013,1,0
1,2013-01-01,1,Tuesday,4,d_704,NewYear,NewYear,2013,2013-01-03,2013,1,0
2,2013-01-02,1,Wednesday,5,d_705,No event,No event,2013,2013-01-04,2013,1,0
3,2013-01-03,1,Thursday,6,d_706,No event,No event,2013,2013-01-05,2013,1,0
4,2013-01-04,1,Friday,7,d_707,No event,No event,2013,2013-01-06,2013,1,0


In [39]:
#del 29/12/12 al 4/1/13 es w53 del 2012 --> cambiamos la parte de 2012
df_events_2013.loc[0,'week_false'] = int(53)
df_events_2013.loc[0,'year_false'] = int(2012)

df_events_2013.loc[1,'week_false'] = int(53)
df_events_2013.loc[1,'year_false'] = int(2012)

df_events_2013.loc[2,'week_false'] = int(53)
df_events_2013.loc[2,'year_false'] = int(2012)

df_events_2013.loc[3,'week_false'] = int(53)
df_events_2013.loc[3,'year_false'] = int(2012)

df_events_2013.loc[4,'week_false'] = int(53)
df_events_2013.loc[4,'year_false'] = int(2012)

In [40]:
df_events_2013.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2012-12-31,12,Monday,3,d_703,No event,No event,2013,2013-01-02,2012,1,53
1,2013-01-01,1,Tuesday,4,d_704,NewYear,NewYear,2013,2013-01-03,2012,1,53
2,2013-01-02,1,Wednesday,5,d_705,No event,No event,2013,2013-01-04,2012,1,53
3,2013-01-03,1,Thursday,6,d_706,No event,No event,2013,2013-01-05,2012,1,53
4,2013-01-04,1,Friday,7,d_707,No event,No event,2013,2013-01-06,2012,1,53
5,2013-01-05,1,Saturday,1,d_708,No event,No event,2013,2013-01-07,2013,1,1
6,2013-01-06,1,Sunday,2,d_709,No event,No event,2013,2013-01-08,2013,1,1
7,2013-01-07,1,Monday,3,d_710,No event,No event,2013,2013-01-09,2013,2,1
8,2013-01-08,1,Tuesday,4,d_711,No event,No event,2013,2013-01-10,2013,2,1
9,2013-01-09,1,Wednesday,5,d_712,No event,No event,2013,2013-01-11,2013,2,1


In [41]:
#REVISO FIN DE AÑO DEL 2013
df_events_2013.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
354,2013-12-20,12,Friday,7,d_1057,No event,No event,2013,2013-12-22,2013,51,50
355,2013-12-21,12,Saturday,1,d_1058,No event,No event,2013,2013-12-23,2013,51,51
356,2013-12-22,12,Sunday,2,d_1059,No event,No event,2013,2013-12-24,2013,51,51
357,2013-12-23,12,Monday,3,d_1060,No event,No event,2013,2013-12-25,2013,52,51
358,2013-12-24,12,Tuesday,4,d_1061,No event,No event,2013,2013-12-26,2013,52,51
359,2013-12-25,12,Wednesday,5,d_1062,No event,Navidad,2013,2013-12-27,2013,52,51
360,2013-12-26,12,Thursday,6,d_1063,No event,No event,2013,2013-12-28,2013,52,51
361,2013-12-27,12,Friday,7,d_1064,No event,No event,2013,2013-12-29,2013,52,51
362,2013-12-28,12,Saturday,1,d_1065,No event,No event,2013,2013-12-30,2014,52,0
363,2013-12-29,12,Sunday,2,d_1066,No event,No event,2013,2013-12-31,2014,52,0


In [42]:
#del sabado 28/12 al 3/1/14 es semana 52 del 2013
df_events_2013.loc[362,'week_false'] = int(52)
df_events_2013.loc[362,'year_false'] = int(2013)

df_events_2013.loc[363,'week_false'] = int(52)
df_events_2013.loc[363,'year_false'] = int(2013)

In [43]:
((df_events_2013.groupby(['year','week_false'])['date'].count())>7).value_counts()
#da 53 pq cuenta la semana 53 del año anterior, como un unique del 2013 

False    53
Name: date, dtype: int64

#### Eventos 2014

In [44]:
df_events_2014 = pd.read_csv(f'{DATA_PATH}/2014.csv',sep=",")

In [45]:
calendario_años(df_events_2014)

False    52
Name: date, dtype: int64


In [46]:
#del sabado 28/12 al 3/1/14 debe semana 52 del 2013
df_events_2014.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2013-12-30,12,Monday,3,d_1067,No event,No event,2014,2014-01-01,2014,1,1
1,2013-12-31,12,Tuesday,4,d_1068,No event,No event,2014,2014-01-02,2014,1,1
2,2014-01-01,1,Wednesday,5,d_1069,NewYear,NewYear,2014,2014-01-03,2014,1,1
3,2014-01-02,1,Thursday,6,d_1070,No event,No event,2014,2014-01-04,2014,1,1
4,2014-01-03,1,Friday,7,d_1071,No event,No event,2014,2014-01-05,2014,1,1
5,2014-01-04,1,Saturday,1,d_1072,No event,No event,2014,2014-01-06,2014,1,2
6,2014-01-05,1,Sunday,2,d_1073,No event,No event,2014,2014-01-07,2014,1,2
7,2014-01-06,1,Monday,3,d_1074,No event,No event,2014,2014-01-08,2014,2,2
8,2014-01-07,1,Tuesday,4,d_1075,No event,No event,2014,2014-01-09,2014,2,2
9,2014-01-08,1,Wednesday,5,d_1076,No event,No event,2014,2014-01-10,2014,2,2


In [47]:
def mover_fecha(df):
    df['week_false']=df['week_false'].astype('int')
    df['week_false']=df['week_false']-1 
    df['week_false']=df['week_false'].astype('str')

In [48]:
mover_fecha(df_events_2014)

In [49]:
df_events_2014.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2013-12-30,12,Monday,3,d_1067,No event,No event,2014,2014-01-01,2014,1,0
1,2013-12-31,12,Tuesday,4,d_1068,No event,No event,2014,2014-01-02,2014,1,0
2,2014-01-01,1,Wednesday,5,d_1069,NewYear,NewYear,2014,2014-01-03,2014,1,0
3,2014-01-02,1,Thursday,6,d_1070,No event,No event,2014,2014-01-04,2014,1,0
4,2014-01-03,1,Friday,7,d_1071,No event,No event,2014,2014-01-05,2014,1,0
5,2014-01-04,1,Saturday,1,d_1072,No event,No event,2014,2014-01-06,2014,1,1
6,2014-01-05,1,Sunday,2,d_1073,No event,No event,2014,2014-01-07,2014,1,1
7,2014-01-06,1,Monday,3,d_1074,No event,No event,2014,2014-01-08,2014,2,1
8,2014-01-07,1,Tuesday,4,d_1075,No event,No event,2014,2014-01-09,2014,2,1
9,2014-01-08,1,Wednesday,5,d_1076,No event,No event,2014,2014-01-10,2014,2,1


In [50]:
#corrijo para la semana 52 del 2013
df_events_2014.loc[0,'week_false'] = int(52)
df_events_2014.loc[0,'year_false'] = int(2013)

df_events_2014.loc[1,'week_false'] = int(52)
df_events_2014.loc[1,'year_false'] = int(2013)

df_events_2014.loc[2,'week_false'] = int(52)
df_events_2014.loc[2,'year_false'] = int(2013)

df_events_2014.loc[3,'week_false'] = int(52)
df_events_2014.loc[3,'year_false'] = int(2013)

df_events_2014.loc[4,'week_false'] = int(52)
df_events_2014.loc[4,'year_false'] = int(2013)

In [51]:
df_events_2014.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2013-12-30,12,Monday,3,d_1067,No event,No event,2014,2014-01-01,2013,1,52
1,2013-12-31,12,Tuesday,4,d_1068,No event,No event,2014,2014-01-02,2013,1,52
2,2014-01-01,1,Wednesday,5,d_1069,NewYear,NewYear,2014,2014-01-03,2013,1,52
3,2014-01-02,1,Thursday,6,d_1070,No event,No event,2014,2014-01-04,2013,1,52
4,2014-01-03,1,Friday,7,d_1071,No event,No event,2014,2014-01-05,2013,1,52
5,2014-01-04,1,Saturday,1,d_1072,No event,No event,2014,2014-01-06,2014,1,1
6,2014-01-05,1,Sunday,2,d_1073,No event,No event,2014,2014-01-07,2014,1,1
7,2014-01-06,1,Monday,3,d_1074,No event,No event,2014,2014-01-08,2014,2,1
8,2014-01-07,1,Tuesday,4,d_1075,No event,No event,2014,2014-01-09,2014,2,1
9,2014-01-08,1,Wednesday,5,d_1076,No event,No event,2014,2014-01-10,2014,2,1


In [52]:
#del 27/12/14 al 2/1/2015 es semana 52 del 2014
df_events_2014.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
354,2014-12-19,12,Friday,7,d_1421,No event,No event,2014,2014-12-21,2014,51,50
355,2014-12-20,12,Saturday,1,d_1422,No event,No event,2014,2014-12-22,2014,51,51
356,2014-12-21,12,Sunday,2,d_1423,No event,No event,2014,2014-12-23,2014,51,51
357,2014-12-22,12,Monday,3,d_1424,No event,No event,2014,2014-12-24,2014,52,51
358,2014-12-23,12,Tuesday,4,d_1425,No event,No event,2014,2014-12-25,2014,52,51
359,2014-12-24,12,Wednesday,5,d_1426,No event,No event,2014,2014-12-26,2014,52,51
360,2014-12-25,12,Thursday,6,d_1427,No event,Navidad,2014,2014-12-27,2014,52,51
361,2014-12-26,12,Friday,7,d_1428,No event,No event,2014,2014-12-28,2014,52,51
362,2014-12-27,12,Saturday,1,d_1429,No event,No event,2014,2014-12-29,2015,52,0
363,2014-12-28,12,Sunday,2,d_1430,No event,No event,2014,2014-12-30,2015,52,0


In [53]:
df_events_2014.loc[362,'week_false'] = int(52)
df_events_2014.loc[362,'year_false'] = int(2014)

df_events_2014.loc[363,'week_false'] = int(52)
df_events_2014.loc[363,'year_false'] = int(2014)

In [54]:
df_events_2014.tail()

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
359,2014-12-24,12,Wednesday,5,d_1426,No event,No event,2014,2014-12-26,2014,52,51
360,2014-12-25,12,Thursday,6,d_1427,No event,Navidad,2014,2014-12-27,2014,52,51
361,2014-12-26,12,Friday,7,d_1428,No event,No event,2014,2014-12-28,2014,52,51
362,2014-12-27,12,Saturday,1,d_1429,No event,No event,2014,2014-12-29,2014,52,52
363,2014-12-28,12,Sunday,2,d_1430,No event,No event,2014,2014-12-30,2014,52,52


#### Eventos 2015

In [55]:
df_events_2015 = pd.read_csv(f'{DATA_PATH}/2015.csv',sep=",")

In [56]:
calendario_años(df_events_2015)

False    53
Name: date, dtype: int64


In [57]:
#del 27/12/14 al 2/1/2015 es semana 52 del 2014

df_events_2015.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2014-12-29,12,Monday,3,d_1431,No event,No event,2015,2014-12-31,2015,1,1
1,2014-12-30,12,Tuesday,4,d_1432,No event,No event,2015,2015-01-01,2015,1,1
2,2014-12-31,12,Wednesday,5,d_1433,No event,No event,2015,2015-01-02,2015,1,1
3,2015-01-01,1,Thursday,6,d_1434,NewYear,NewYear,2015,2015-01-03,2015,1,1
4,2015-01-02,1,Friday,7,d_1435,No event,No event,2015,2015-01-04,2015,1,1
5,2015-01-03,1,Saturday,1,d_1436,No event,No event,2015,2015-01-05,2015,1,2
6,2015-01-04,1,Sunday,2,d_1437,No event,No event,2015,2015-01-06,2015,1,2
7,2015-01-05,1,Monday,3,d_1438,No event,No event,2015,2015-01-07,2015,2,2
8,2015-01-06,1,Tuesday,4,d_1439,No event,No event,2015,2015-01-08,2015,2,2
9,2015-01-07,1,Wednesday,5,d_1440,No event,No event,2015,2015-01-09,2015,2,2


In [58]:
mover_fecha(df_events_2015)

In [59]:
df_events_2015.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2014-12-29,12,Monday,3,d_1431,No event,No event,2015,2014-12-31,2015,1,0
1,2014-12-30,12,Tuesday,4,d_1432,No event,No event,2015,2015-01-01,2015,1,0
2,2014-12-31,12,Wednesday,5,d_1433,No event,No event,2015,2015-01-02,2015,1,0
3,2015-01-01,1,Thursday,6,d_1434,NewYear,NewYear,2015,2015-01-03,2015,1,0
4,2015-01-02,1,Friday,7,d_1435,No event,No event,2015,2015-01-04,2015,1,0
5,2015-01-03,1,Saturday,1,d_1436,No event,No event,2015,2015-01-05,2015,1,1
6,2015-01-04,1,Sunday,2,d_1437,No event,No event,2015,2015-01-06,2015,1,1
7,2015-01-05,1,Monday,3,d_1438,No event,No event,2015,2015-01-07,2015,2,1
8,2015-01-06,1,Tuesday,4,d_1439,No event,No event,2015,2015-01-08,2015,2,1
9,2015-01-07,1,Wednesday,5,d_1440,No event,No event,2015,2015-01-09,2015,2,1


In [60]:
df_events_2015.loc[0,'week_false'] = int(52)
df_events_2015.loc[0,'year_false'] = int(2014)

df_events_2015.loc[1,'week_false'] = int(52)
df_events_2015.loc[1,'year_false'] = int(2014)

df_events_2015.loc[2,'week_false'] = int(52)
df_events_2015.loc[2,'year_false'] = int(2014)

df_events_2015.loc[3,'week_false'] = int(52)
df_events_2015.loc[3,'year_false'] = int(2014)

df_events_2015.loc[4,'week_false'] = int(52)
df_events_2015.loc[4,'year_false'] = int(2014)

In [61]:
df_events_2015.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2014-12-29,12,Monday,3,d_1431,No event,No event,2015,2014-12-31,2014,1,52
1,2014-12-30,12,Tuesday,4,d_1432,No event,No event,2015,2015-01-01,2014,1,52
2,2014-12-31,12,Wednesday,5,d_1433,No event,No event,2015,2015-01-02,2014,1,52
3,2015-01-01,1,Thursday,6,d_1434,NewYear,NewYear,2015,2015-01-03,2014,1,52
4,2015-01-02,1,Friday,7,d_1435,No event,No event,2015,2015-01-04,2014,1,52
5,2015-01-03,1,Saturday,1,d_1436,No event,No event,2015,2015-01-05,2015,1,1
6,2015-01-04,1,Sunday,2,d_1437,No event,No event,2015,2015-01-06,2015,1,1
7,2015-01-05,1,Monday,3,d_1438,No event,No event,2015,2015-01-07,2015,2,1
8,2015-01-06,1,Tuesday,4,d_1439,No event,No event,2015,2015-01-08,2015,2,1
9,2015-01-07,1,Wednesday,5,d_1440,No event,No event,2015,2015-01-09,2015,2,1


In [62]:
df_events_2015.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
361,2015-12-25,12,Friday,7,d_1792,No event,Navidad,2015,2015-12-27,2015,52,51
362,2015-12-26,12,Saturday,1,d_1793,No event,No event,2015,2015-12-28,2015,52,52
363,2015-12-27,12,Sunday,2,d_1794,No event,No event,2015,2015-12-29,2015,52,52
364,2015-12-28,12,Monday,3,d_1795,No event,No event,2015,2015-12-30,2015,53,52
365,2015-12-29,12,Tuesday,4,d_1796,No event,No event,2015,2015-12-31,2015,53,52
366,2015-12-30,12,Wednesday,5,d_1797,No event,No event,2015,2016-01-01,2015,53,52
367,2015-12-31,12,Thursday,6,d_1798,No event,No event,2015,2016-01-02,2015,53,52
368,2016-01-01,1,Friday,7,d_1799,NewYear,NewYear,2015,2016-01-03,2015,53,52
369,2016-01-02,1,Saturday,1,d_1800,No event,No event,2015,2016-01-04,2016,53,0
370,2016-01-03,1,Sunday,2,d_1801,No event,No event,2015,2016-01-05,2016,53,0


In [63]:
df_events_2015.loc[369,'week_false'] = int(1)
df_events_2015.loc[369,'year_false'] = int(2016)

df_events_2015.loc[370,'week_false'] = int(1)
df_events_2015.loc[370,'year_false'] = int(2016)

In [64]:
df_events_2015.tail(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
361,2015-12-25,12,Friday,7,d_1792,No event,Navidad,2015,2015-12-27,2015,52,51
362,2015-12-26,12,Saturday,1,d_1793,No event,No event,2015,2015-12-28,2015,52,52
363,2015-12-27,12,Sunday,2,d_1794,No event,No event,2015,2015-12-29,2015,52,52
364,2015-12-28,12,Monday,3,d_1795,No event,No event,2015,2015-12-30,2015,53,52
365,2015-12-29,12,Tuesday,4,d_1796,No event,No event,2015,2015-12-31,2015,53,52
366,2015-12-30,12,Wednesday,5,d_1797,No event,No event,2015,2016-01-01,2015,53,52
367,2015-12-31,12,Thursday,6,d_1798,No event,No event,2015,2016-01-02,2015,53,52
368,2016-01-01,1,Friday,7,d_1799,NewYear,NewYear,2015,2016-01-03,2015,53,52
369,2016-01-02,1,Saturday,1,d_1800,No event,No event,2015,2016-01-04,2016,53,1
370,2016-01-03,1,Sunday,2,d_1801,No event,No event,2015,2016-01-05,2016,53,1


#### Eventos 2016

In [65]:
df_events_2016 = pd.read_csv(f'{DATA_PATH}/2016.csv',sep=",")

In [66]:
calendario_años(df_events_2016)

False    17
Name: date, dtype: int64


In [67]:
df_events_2016.head(50)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2016-01-04,1,Monday,3,d_1802,No event,No event,2016,2016-01-06,2016,1,1
1,2016-01-05,1,Tuesday,4,d_1803,No event,No event,2016,2016-01-07,2016,1,1
2,2016-01-06,1,Wednesday,5,d_1804,No event,No event,2016,2016-01-08,2016,1,1
3,2016-01-07,1,Thursday,6,d_1805,No event,No event,2016,2016-01-09,2016,1,1
4,2016-01-08,1,Friday,7,d_1806,No event,No event,2016,2016-01-10,2016,1,1
5,2016-01-09,1,Saturday,1,d_1807,No event,No event,2016,2016-01-11,2016,1,2
6,2016-01-10,1,Sunday,2,d_1808,No event,No event,2016,2016-01-12,2016,1,2
7,2016-01-11,1,Monday,3,d_1809,No event,No event,2016,2016-01-13,2016,2,2
8,2016-01-12,1,Tuesday,4,d_1810,No event,No event,2016,2016-01-14,2016,2,2
9,2016-01-13,1,Wednesday,5,d_1811,No event,No event,2016,2016-01-15,2016,2,2


### Rearmamos de nuevo Eventos

In [68]:
# Apilar los df uno encima del otro
df_calendar_final = pd.concat([df_events_2011, df_events_2012,df_events_2013,df_events_2014,df_events_2015,df_events_2016], axis=0)

In [69]:
df_calendar_final

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,2011-01-31,2011,4,5
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,2011-02-01,2011,4,5
2,2011-01-31,1,Monday,3,d_3,No event,No event,2011,2011-02-02,2011,5,5
3,2011-02-01,2,Tuesday,4,d_4,No event,No event,2011,2011-02-03,2011,5,5
4,2011-02-02,2,Wednesday,5,d_5,No event,No event,2011,2011-02-04,2011,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...
107,2016-04-20,4,Wednesday,5,d_1909,No event,No event,2016,2016-04-22,2016,16,16
108,2016-04-21,4,Thursday,6,d_1910,No event,No event,2016,2016-04-23,2016,16,16
109,2016-04-22,4,Friday,7,d_1911,No event,No event,2016,2016-04-24,2016,16,16
110,2016-04-23,4,Saturday,1,d_1912,No event,No event,2016,2016-04-25,2016,16,17


In [70]:
df_calendar_final.pivot_table(index='year_false', values='week_false', aggfunc=lambda x: len(x.unique()))

Unnamed: 0_level_0,week_false
year_false,Unnamed: 1_level_1
2011,48
2012,53
2013,52
2014,52
2015,52
2016,17


In [71]:
df_calendar_final['week_false']=df_calendar_final['week_false'].astype('int')
df_calendar_final['year_false']=df_calendar_final['year_false'].astype('int')

In [72]:
df_calendar_final[['year_false','week_false']].groupby('year_false').max()

Unnamed: 0_level_0,week_false
year_false,Unnamed: 1_level_1
2011,52
2012,53
2013,52
2014,52
2015,52
2016,17


In [73]:
df_calendar_final['week_false'] = df_calendar_final['week_false'].astype(str) # lo tengo q convertir a str para aplicar zfill
df_calendar_final['week_false'] = df_calendar_final['week_false'].apply(lambda x: x.zfill(2)) # para que tenga 2 dígitos

df_calendar_final['year_false'] = df_calendar_final['year_false'].astype(str)

In [74]:
#concateno
df_calendar_final['yearweek'] = df_calendar_final.year_false.str.cat(df_calendar_final.week_false)

In [75]:
df_calendar_final.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year,date_false,year_false,week,week_false,yearweek
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,2011-01-31,2011,4,5,201105
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,2011-02-01,2011,4,5,201105
2,2011-01-31,1,Monday,3,d_3,No event,No event,2011,2011-02-02,2011,5,5,201105
3,2011-02-01,2,Tuesday,4,d_4,No event,No event,2011,2011-02-03,2011,5,5,201105
4,2011-02-02,2,Wednesday,5,d_5,No event,No event,2011,2011-02-04,2011,5,5,201105
5,2011-02-03,2,Thursday,6,d_6,No event,No event,2011,2011-02-05,2011,5,5,201105
6,2011-02-04,2,Friday,7,d_7,No event,No event,2011,2011-02-06,2011,5,5,201105
7,2011-02-05,2,Saturday,1,d_8,No event,No event,2011,2011-02-07,2011,5,6,201106
8,2011-02-06,2,Sunday,2,d_9,SuperBowl,SuperBowl,2011,2011-02-08,2011,5,6,201106
9,2011-02-07,2,Monday,3,d_10,No event,No event,2011,2011-02-09,2011,6,6,201106


In [76]:
df_calendar_final2=df_calendar_final.copy()

In [77]:
var_drop=['year','week','date_false']
df_calendar_final2.drop(var_drop,axis=1,inplace=True)

In [78]:
df_calendar_final2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1913 entries, 0 to 111
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1913 non-null   datetime64[ns]
 1   mes            1913 non-null   int64         
 2   weekday        1913 non-null   object        
 3   weekday_int    1913 non-null   int64         
 4   d              1913 non-null   object        
 5   event          1913 non-null   object        
 6   festivos_eeuu  1913 non-null   object        
 7   year_false     1913 non-null   object        
 8   week_false     1913 non-null   object        
 9   yearweek       1913 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 164.4+ KB


In [79]:
# Pasamos a categóricos los tres objetos y a enteros el dia de la semana
cambioFormato(df_calendar_final2,['weekday', 'd', 'event'],'category')
cambioFormato(df_calendar_final2,['weekday_int','year_false','week_false','yearweek'],np.int32)

In [80]:
df_calendar_final2.to_csv(f'{DATA_PATH}/df_calendar_final_20230317.csv')

In [81]:
del df_calendar_final, df_calendar_final2, df_events

In [82]:
del df_events_2011,df_events_2012,df_events_2013,df_events_2014,df_events_2015,df_events_2016

In [83]:
df_calendar = pd.read_csv(f'{DATA_PATH}/df_calendar_final_20230317.csv',sep=",")

## Trabajamos con el calendario limpio

In [84]:
df_calendar.drop('Unnamed: 0',axis=1,inplace=True)
df_calendar['date'] = pd.to_datetime(df_calendar['date'])

In [85]:
df_calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1913 entries, 0 to 1912
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1913 non-null   datetime64[ns]
 1   mes            1913 non-null   int64         
 2   weekday        1913 non-null   object        
 3   weekday_int    1913 non-null   int64         
 4   d              1913 non-null   object        
 5   event          1913 non-null   object        
 6   festivos_eeuu  1913 non-null   object        
 7   year_false     1913 non-null   int64         
 8   week_false     1913 non-null   int64         
 9   yearweek       1913 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(4)
memory usage: 149.6+ KB


In [86]:
df_calendar.head(10)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year_false,week_false,yearweek
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,5,201105
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,5,201105
2,2011-01-31,1,Monday,3,d_3,No event,No event,2011,5,201105
3,2011-02-01,2,Tuesday,4,d_4,No event,No event,2011,5,201105
4,2011-02-02,2,Wednesday,5,d_5,No event,No event,2011,5,201105
5,2011-02-03,2,Thursday,6,d_6,No event,No event,2011,5,201105
6,2011-02-04,2,Friday,7,d_7,No event,No event,2011,5,201105
7,2011-02-05,2,Saturday,1,d_8,No event,No event,2011,6,201106
8,2011-02-06,2,Sunday,2,d_9,SuperBowl,SuperBowl,2011,6,201106
9,2011-02-07,2,Monday,3,d_10,No event,No event,2011,6,201106


In [87]:
df_calendar['yearweek']=df_calendar['yearweek'].astype('object')

In [88]:
# Obtenemos las semanas y lo pasamos a una lista
semanas=df_calendar[['yearweek']].value_counts().to_frame().sort_values(by='yearweek').reset_index()
lista_semanas=semanas['yearweek'].to_list()

In [89]:
# comprobamos que tenemos la última semana incompleta
# Procederemos a borrarla
print(lista_semanas[-1]);
lista_semanas.pop()
print(lista_semanas[-1]);

201617
201616


In [90]:
MIN_DATE = df_calendar["date"].min()
MIN_DATE

Timestamp('2011-01-29 00:00:00')

Agregamos un mes mas para la predicción y generamos un rango con todas las fechas, incluídas las predict (todo mayo 2016)

In [91]:
# DATE_PREDICTION = np.datetime64("2016-05-31")
# date_range = pd.date_range(MIN_DATE, DATE_PREDICTION, freq = "w")
# date_range = pd.date_range('2011-01-28 00:00:00', DATE_PREDICTION, freq = "w")

In [92]:
# Agrupacion de ventas por semana. VOY DIRECTO ABAJO DE PRICES
# TENGO Q HACER EL PRODUCTO CARTESIANO ENTRE PRODUCTOS Y FECHAS
unique_id=df_sales['id'].unique()
len(unique_id)

30490

In [93]:
# Creamos el producto cartesiano
cartesian_product = pd.MultiIndex.from_product([lista_semanas, unique_id], names = ["week", "id"])
len(cartesian_product)

8323770

In [94]:
full_df = pd.DataFrame(index = cartesian_product).reset_index()

In [95]:
full_df.tail(3)

Unnamed: 0,week,id
8323767,201616,SUPERMARKET_3_825_PHI_3
8323768,201616,SUPERMARKET_3_826_PHI_3
8323769,201616,SUPERMARKET_3_827_PHI_3


## Transposición de las columnas
Paso siguiente: a las ventas ponerle fechas --> cruzarlo con calendar, y pasarlo a weeks (**resample**)
y despues lo cruzamos con este full_df(producto cartesiano) 


In [96]:
columns_cat = ['item',
               'category',
               'department',
               'store',
               'store_code',
               'region']

In [97]:
cambioFormato(df_sales, columns_cat, 'category')

In [98]:
%%time
# melt >> es útil para enviar mensajes a un DataFrame en un formato en el que una o más columnas son
# variables de identificación, mientras que todas las demás columnas, consideradas variables medidas,
# no están vinculadas al eje de la fila, dejando solo dos columnas sin identificador, variable y valor. 
### Resumiendo, deshacemos el pivot aplicando los identificadores y los pivotados. ###

# id_vars[tupla, lista o ndarray, opcional] : Columna(s) para usar como variables de identificación. 
# value_vars[tupla, lista o ndarray, opcional]: Columna(s) para anular el pivote. Si no se especifica,
# usa todas las columnas que no están configuradas como id_vars. 
# var_name[scalar]: Nombre a usar para la columna ‘variable’. Si es Ninguno, usa marco.columnas.nombre
# o ‘variable’. 
# value_name[scalar, default ‘value’]: Nombre a usar para la columna ‘value’. 
# col_level[int o string, opcional]: si las columnas son un índice múltiple, use este nivel para derretir. (#nono)

df_sales_melted = df_sales.melt(id_vars=['id',
                                         'item',
                                         'category',
                                         'department',
                                         'store',
                                         'store_code',
                                         'region'],
                                value_vars=df_sales.columns[7:],
                                var_name='d',
                                value_name='sales')

CPU times: user 10.4 s, sys: 2.1 s, total: 12.5 s
Wall time: 13.1 s


In [99]:
df_sales_melted.head(2)

Unnamed: 0,id,item,category,department,store,store_code,region,d,sales
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,d_1,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,d_1,0


In [100]:
df_sales_melted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58327370 entries, 0 to 58327369
Data columns (total 9 columns):
 #   Column      Dtype   
---  ------      -----   
 0   id          object  
 1   item        category
 2   category    category
 3   department  category
 4   store       category
 5   store_code  category
 6   region      category
 7   d           object  
 8   sales       int64   
dtypes: category(6), int64(1), object(2)
memory usage: 1.7+ GB


In [101]:
df_sales_melted.head(4).T

Unnamed: 0,0,1,2,3
id,ACCESORIES_1_001_NYC_1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_003_NYC_1,ACCESORIES_1_004_NYC_1
item,ACCESORIES_1_001,ACCESORIES_1_002,ACCESORIES_1_003,ACCESORIES_1_004
category,ACCESORIES,ACCESORIES,ACCESORIES,ACCESORIES
department,ACCESORIES_1,ACCESORIES_1,ACCESORIES_1,ACCESORIES_1
store,Greenwich_Village,Greenwich_Village,Greenwich_Village,Greenwich_Village
store_code,NYC_1,NYC_1,NYC_1,NYC_1
region,New York,New York,New York,New York
d,d_1,d_1,d_1,d_1
sales,0,0,0,0


In [102]:
# Sin anotación científica
df_sales_melted['sales'].describe().to_frame().apply(lambda x: '%.5f' % x, axis=1)

count    58327370.00000
mean            1.12632
std             3.87311
min             0.00000
25%             0.00000
50%             0.00000
75%             1.00000
max           763.00000
dtype: object

### Trabajamos en el inner join de Calendario y  Ventas

In [103]:
df_calendar.head(2)

Unnamed: 0,date,mes,weekday,weekday_int,d,event,festivos_eeuu,year_false,week_false,yearweek
0,2011-01-29,1,Saturday,1,d_1,No event,No event,2011,5,201105
1,2011-01-30,1,Sunday,2,d_2,No event,No event,2011,5,201105


In [104]:
df_calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1913 entries, 0 to 1912
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1913 non-null   datetime64[ns]
 1   mes            1913 non-null   int64         
 2   weekday        1913 non-null   object        
 3   weekday_int    1913 non-null   int64         
 4   d              1913 non-null   object        
 5   event          1913 non-null   object        
 6   festivos_eeuu  1913 non-null   object        
 7   year_false     1913 non-null   int64         
 8   week_false     1913 non-null   int64         
 9   yearweek       1913 non-null   object        
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 149.6+ KB


In [105]:
cambioFormato(df_calendar, ['weekday','d','event'], 'category')

In [106]:
# Comprobamos que las dataframes que vamos a unir tengan la misma
# última semana
print("df_calendar", df_calendar['yearweek'].max());

df_calendar 201617


In [107]:
df_calendar = df_calendar[df_calendar['yearweek'] != 201617]

In [108]:
print("df_calendar", df_calendar['yearweek'].max());

df_calendar 201616


In [109]:
%%time
# Merge con sales con ventas > 0
df_merged_inner = pd.merge(left=df_sales_melted,right=df_calendar, left_on='d', right_on='d')

CPU times: user 14 s, sys: 4.19 s, total: 18.2 s
Wall time: 19.8 s


In [110]:
df_merged_inner.drop('d',axis=1,inplace=True)

In [111]:
df_merged_inner['yearweek'].max()

201616

In [112]:
df_merged_inner['total_transacciones']=df_merged_inner['sales']

In [113]:
df_merged_inner.sample(5)

Unnamed: 0,id,item,category,department,store,store_code,region,sales,date,mes,weekday,weekday_int,event,festivos_eeuu,year_false,week_false,yearweek,total_transacciones
58251472,ACCESORIES_1_336_BOS_2,ACCESORIES_1_336,ACCESORIES,ACCESORIES_1,Roxbury,BOS_2,Boston,1,2016-04-22,4,Friday,7,No event,No event,2016,16,201616,1
14798128,HOME_&_GARDEN_2_236_NYC_4,HOME_&_GARDEN_2_236,HOME_&_GARDEN,HOME_&_GARDEN_2,Brooklyn,NYC_4,New York,0,2012-05-28,5,Monday,3,No event,Memorial day,2012,22,201222,0
54636442,HOME_&_GARDEN_2_316_PHI_3,HOME_&_GARDEN_2_316,HOME_&_GARDEN,HOME_&_GARDEN_2,Queen_Village,PHI_3,Philadelphia,0,2015-12-25,12,Friday,7,No event,Navidad,2015,51,201551,0
5967736,HOME_&_GARDEN_1_285_PHI_1,HOME_&_GARDEN_1_285,HOME_&_GARDEN,HOME_&_GARDEN_1,Midtown_Village,PHI_1,Philadelphia,0,2011-08-12,8,Friday,7,No event,No event,2011,32,201132,0
4480043,HOME_&_GARDEN_1_507_PHI_3,HOME_&_GARDEN_1_507,HOME_&_GARDEN,HOME_&_GARDEN_1,Queen_Village,PHI_3,Philadelphia,0,2011-06-24,6,Friday,7,No event,No event,2011,25,201125,0


In [114]:
df_merged_inner['id'].nunique()

30490

In [115]:
df_merged_inner.groupby('yearweek')['id'].size() #213430 = 30490 id * 7 dias de cada semana

yearweek
201105    213430
201106    213430
201107    213430
201108    213430
201109    213430
           ...  
201612    213430
201613    213430
201614    213430
201615    213430
201616    213430
Name: id, Length: 273, dtype: int64

In [116]:
df_merged_inner.groupby('yearweek')['id'].size().tail(4)
# Se decide borrar la última semana ya que se puede comprobar
# que tenemos falta de información a la hora de hacer el resample()
# ya que no estarían todos los id de productos para completarlo. 
# 201617     60980

yearweek
201613    213430
201614    213430
201615    213430
201616    213430
Name: id, dtype: int64

In [117]:
print(df_merged_inner.shape)
df_merged_inner = df_merged_inner[df_merged_inner['yearweek'] != 201617]
print(df_merged_inner.shape)

(58266390, 18)
(58266390, 18)


In [118]:
df_merged_inner['festivos_eeuu'].value_counts()

No event                   55217390
Pascua                       365880
San Valentin                 182940
Presidents Day               182940
San Patricio                 182940
SuperBowl                    182940
Halloween                    152450
Martin Luther King Jr        152450
NewYear                      152450
Navidad                      152450
Black Friday                 152450
Thanksgiving                 152450
Ramadan starts               152450
Labor day                    152450
dia de la independencia      152450
dia del padre                152450
Memorial day                 152450
dia de la madre              152450
Easter                        91470
amazon prime day              30490
Name: festivos_eeuu, dtype: int64

In [119]:
%%time
# Paso las variables de festivos_eeuu a dummie
# lo que quiero ver en el proximo notebook es impacto
# de ventas en cada dia festivo. Acá hay en 1 igual 
# para todos los items
df_merged_inner = pd.get_dummies(data=df_merged_inner, columns=['festivos_eeuu']) 

CPU times: user 10.1 s, sys: 4.2 s, total: 14.3 s
Wall time: 15.9 s


In [120]:
df_merged_inner.head(3)

Unnamed: 0,id,item,category,department,store,store_code,region,sales,date,mes,...,festivos_eeuu_Presidents Day,festivos_eeuu_Ramadan starts,festivos_eeuu_San Patricio,festivos_eeuu_San Valentin,festivos_eeuu_SuperBowl,festivos_eeuu_Thanksgiving,festivos_eeuu_amazon prime day,festivos_eeuu_dia de la independencia,festivos_eeuu_dia de la madre,festivos_eeuu_dia del padre
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,2011-01-29,1,...,0,0,0,0,0,0,0,0,0,0
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,2011-01-29,1,...,0,0,0,0,0,0,0,0,0,0
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,ACCESORIES_1,Greenwich_Village,NYC_1,New York,0,2011-01-29,1,...,0,0,0,0,0,0,0,0,0,0


In [121]:
#creamos booleano de evento
df_merged_inner['event_bool']=np.where(df_merged_inner['event']=='No event',0,1)

**Vamos a crear una variable de estación del año para poder analizar la variable en el armado de clusters.**

In [122]:
df_merged_inner['mes']=df_merged_inner['date'].dt.month

In [123]:
estaciones_mapping={
    1: 'invierno', 
    2: 'invierno', 
    3: 'invierno', 
    4: 'primavera', 
    5: 'primavera',
    6: 'primavera', 
    7: 'verano',
    8: 'verano', 
    9: 'verano', 
    10: 'otoño',
    11: 'otoño', 
    12: 'otoño'
} 
    

In [124]:
df_merged_inner['estacion']=df_merged_inner['mes'].map(estaciones_mapping)

In [125]:
from scipy import stats

In [126]:
%%time
## Resample
df_weekly_agg = df_merged_inner.groupby(['yearweek','id']).agg(
    {
        "sales":np.sum,
        "estacion":lambda x: stats.mode(x)[0][0],
        "event_bool":np.max,
        "festivos_eeuu_Presidents Day":np.max,
        "festivos_eeuu_Ramadan starts":np.max,
        "festivos_eeuu_San Patricio":np.max,
        "festivos_eeuu_San Valentin":np.max,
        "festivos_eeuu_SuperBowl":np.max,
        "festivos_eeuu_Thanksgiving":np.max,
        "festivos_eeuu_amazon prime day":np.max,
        "festivos_eeuu_dia de la independencia":np.max,
        "festivos_eeuu_dia de la madre":np.max,
        "festivos_eeuu_dia del padre":np.max,
        
    }
).rename(
    columns = {
        "sales":"weekly_sales",
        "estacion":"estacion",
        "event_bool":"week_event",
        "festivos_eeuu_Presidents Day":'Predidents_day',
        "festivos_eeuu_Ramadan starts":'Ramadan_starts',
        "festivos_eeuu_San Patricio":'San_Patricio',
        "festivos_eeuu_San Valentin":'San_Valentin',
        "festivos_eeuu_SuperBowl":'Superbowl',
        "festivos_eeuu_Thanksgiving":'Thanksgiving',
        "festivos_eeuu_amazon prime day":'Amazon_prime_day',
        "festivos_eeuu_dia de la independencia":'dia_de_la_indep',
        "festivos_eeuu_dia de la madre":'dia_madre',
        "festivos_eeuu_dia del padre":'dia_padre'
        
        
        
    }
).reset_index()



CPU times: user 7min 2s, sys: 21.4 s, total: 7min 24s
Wall time: 7min 8s


In [127]:
df_weekly_agg.head(5)

Unnamed: 0,yearweek,id,weekly_sales,estacion,week_event,Predidents_day,Ramadan_starts,San_Patricio,San_Valentin,Superbowl,Thanksgiving,Amazon_prime_day,dia_de_la_indep,dia_madre,dia_padre
0,201105,ACCESORIES_1_001_BOS_1,0,invierno,0,0,0,0,0,0,0,0,0,0,0
1,201105,ACCESORIES_1_001_BOS_2,0,invierno,0,0,0,0,0,0,0,0,0,0,0
2,201105,ACCESORIES_1_001_BOS_3,0,invierno,0,0,0,0,0,0,0,0,0,0,0
3,201105,ACCESORIES_1_001_NYC_1,0,invierno,0,0,0,0,0,0,0,0,0,0,0
4,201105,ACCESORIES_1_001_NYC_2,0,invierno,0,0,0,0,0,0,0,0,0,0,0


In [128]:
Q_datos_por_estacion=df_weekly_agg['estacion'].value_counts().to_frame()

In [129]:
Q_datos_por_estacion.head(4)

Unnamed: 0,estacion
invierno,2225770
primavera,2073320
verano,2012340
otoño,2012340


In [130]:
df_weekly_agg.tail(4)

Unnamed: 0,yearweek,id,weekly_sales,estacion,week_event,Predidents_day,Ramadan_starts,San_Patricio,San_Valentin,Superbowl,Thanksgiving,Amazon_prime_day,dia_de_la_indep,dia_madre,dia_padre
8323766,201616,SUPERMARKET_3_827_NYC_4,10,primavera,0,0,0,0,0,0,0,0,0,0,0
8323767,201616,SUPERMARKET_3_827_PHI_1,25,primavera,0,0,0,0,0,0,0,0,0,0,0
8323768,201616,SUPERMARKET_3_827_PHI_2,13,primavera,0,0,0,0,0,0,0,0,0,0,0
8323769,201616,SUPERMARKET_3_827_PHI_3,0,primavera,0,0,0,0,0,0,0,0,0,0,0


In [131]:
## 273 semanas totales (recordar que borramos la última semana)
df_weekly_agg['yearweek'].nunique()

273

In [132]:
# Tomamos la información que necesitamos para mas tarde
add_info=df_merged_inner[['id',
                          'item',
                          'category',
                          'store',
                          'store_code',
                          'region']].drop_duplicates()

In [133]:
add_info.head(3)

Unnamed: 0,id,item,category,store,store_code,region
0,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,Greenwich_Village,NYC_1,New York
1,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,Greenwich_Village,NYC_1,New York
2,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,Greenwich_Village,NYC_1,New York


In [134]:
%%time
# Unimos el full_df que era el productor cartesiano de semanas y id

print(full_df.shape)

full_df = pd.merge(
    left = full_df,
    right = add_info,
    how = "left",
    on = "id"
)

print(full_df.shape)

(8323770, 2)
(8323770, 7)
CPU times: user 1.86 s, sys: 220 ms, total: 2.08 s
Wall time: 2.14 s


In [135]:
full_df.rename(columns={'week':'yearweek'},inplace=True)
full_df.head(3)

Unnamed: 0,yearweek,id,item,category,store,store_code,region
0,201105,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,Greenwich_Village,NYC_1,New York
1,201105,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,Greenwich_Village,NYC_1,New York
2,201105,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,Greenwich_Village,NYC_1,New York


In [136]:
# vemos nulos
df_weekly_agg.isnull().sum()

yearweek            0
id                  0
weekly_sales        0
estacion            0
week_event          0
Predidents_day      0
Ramadan_starts      0
San_Patricio        0
San_Valentin        0
Superbowl           0
Thanksgiving        0
Amazon_prime_day    0
dia_de_la_indep     0
dia_madre           0
dia_padre           0
dtype: int64

In [137]:
full_df['yearweek'].max()

201616

In [138]:
df_weekly_agg['yearweek'].max()

201616

In [139]:
%%time
# Join por la izquierda.
print(full_df.shape)
full_df = pd.merge(full_df, df_weekly_agg, on = ['yearweek','id'], how = 'left')
print(full_df.shape)

(8323770, 7)
(8323770, 20)
CPU times: user 3.59 s, sys: 573 ms, total: 4.17 s
Wall time: 4.23 s


In [140]:
full_df.head(5)

Unnamed: 0,yearweek,id,item,category,store,store_code,region,weekly_sales,estacion,week_event,Predidents_day,Ramadan_starts,San_Patricio,San_Valentin,Superbowl,Thanksgiving,Amazon_prime_day,dia_de_la_indep,dia_madre,dia_padre
0,201105,ACCESORIES_1_001_NYC_1,ACCESORIES_1_001,ACCESORIES,Greenwich_Village,NYC_1,New York,0,invierno,0,0,0,0,0,0,0,0,0,0,0
1,201105,ACCESORIES_1_002_NYC_1,ACCESORIES_1_002,ACCESORIES,Greenwich_Village,NYC_1,New York,0,invierno,0,0,0,0,0,0,0,0,0,0,0
2,201105,ACCESORIES_1_003_NYC_1,ACCESORIES_1_003,ACCESORIES,Greenwich_Village,NYC_1,New York,0,invierno,0,0,0,0,0,0,0,0,0,0,0
3,201105,ACCESORIES_1_004_NYC_1,ACCESORIES_1_004,ACCESORIES,Greenwich_Village,NYC_1,New York,0,invierno,0,0,0,0,0,0,0,0,0,0,0
4,201105,ACCESORIES_1_005_NYC_1,ACCESORIES_1_005,ACCESORIES,Greenwich_Village,NYC_1,New York,0,invierno,0,0,0,0,0,0,0,0,0,0,0


In [141]:
full_df.isnull().sum()

yearweek            0
id                  0
item                0
category            0
store               0
store_code          0
region              0
weekly_sales        0
estacion            0
week_event          0
Predidents_day      0
Ramadan_starts      0
San_Patricio        0
San_Valentin        0
Superbowl           0
Thanksgiving        0
Amazon_prime_day    0
dia_de_la_indep     0
dia_madre           0
dia_padre           0
dtype: int64

In [143]:
# Ya con esta tabla, la tenemos que unir con la de precio asi sacamos el revenue

In [144]:
cambioFormato(df_prices,['item', 'category', 'store_code'],'category')

In [145]:
# vemos datos generales del dataset de precios
datosGenerales(df_prices)

Su shape (6965706, 5)
El número de duplicados es  212120


item             0 nulos ________
category         0 nulos ________
store_code       0 nulos ________
yearweek      243920 nulos, 3.5 %
sell_price       0 nulos ________
dtype: object

In [146]:
df_prices.dropna(inplace=True)

In [147]:
df_prices.sort_values(by='yearweek')

Unnamed: 0,item,category,store_code,yearweek,sell_price
5539160,SUPERMARKET_3_702,SUPERMARKET,PHI_1,201105.0,3.9360
6613947,HOME_&_GARDEN_2_445,HOME_&_GARDEN,PHI_3,201105.0,8.1000
261069,HOME_&_GARDEN_2_041,HOME_&_GARDEN,NYC_1,201105.0,6.5875
1516539,HOME_&_GARDEN_1_117,HOME_&_GARDEN,NYC_3,201105.0,11.2125
563249,SUPERMARKET_3_188,SUPERMARKET,NYC_1,201105.0,2.3760
...,...,...,...,...,...
2386874,HOME_&_GARDEN_2_338,HOME_&_GARDEN,NYC_4,201617.0,6.1500
2387152,HOME_&_GARDEN_2_339,HOME_&_GARDEN,NYC_4,201617.0,8.0750
6156810,SUPERMARKET_3_382,SUPERMARKET,PHI_2,201617.0,2.1360
2386079,HOME_&_GARDEN_2_334,HOME_&_GARDEN,NYC_4,201617.0,3.7250


In [151]:
# Eliminamos la semana 201617 por ser incompleta
df_prices = df_prices[df_prices['yearweek'] != 201617 ]

In [153]:
full_df['item'].nunique()

3049

## Final merge para tener 1 sola tabla

In [154]:
#%%time
print(full_df.shape)
df_final = pd.merge(full_df, df_prices, on = ['yearweek',
                                              'item',
                                              'category',
                                              'store_code'],
                    how = 'left')
print(df_final.shape)

(8323770, 20)
(8323770, 21)


In [155]:
df_final.isnull().sum()

yearweek                  0
id                        0
item                      0
category                  0
store                     0
store_code                0
region                    0
weekly_sales              0
estacion                  0
week_event                0
Predidents_day            0
Ramadan_starts            0
San_Patricio              0
San_Valentin              0
Superbowl                 0
Thanksgiving              0
Amazon_prime_day          0
dia_de_la_indep           0
dia_madre                 0
dia_padre                 0
sell_price          1757059
dtype: int64

In [156]:
%%time
# Debido a los nulos en precios, los completamos con los de la semana anterior
df_final["sell_price"] = df_final.groupby(["item",
                                           'category',
                                           'store_code'])["sell_price"].apply(lambda series:
                                                                              series.backfill().ffill())

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


CPU times: user 5.6 s, sys: 559 ms, total: 6.16 s
Wall time: 6.35 s


In [157]:
df_final['revenue']=df_final['weekly_sales']*df_final['sell_price']

In [158]:
df_final.sample(5)

Unnamed: 0,yearweek,id,item,category,store,store_code,region,weekly_sales,estacion,week_event,...,San_Patricio,San_Valentin,Superbowl,Thanksgiving,Amazon_prime_day,dia_de_la_indep,dia_madre,dia_padre,sell_price,revenue
8106371,201609,SUPERMARKET_2_303_PHI_2,SUPERMARKET_2_303,SUPERMARKET,Yorktown,PHI_2,Philadelphia,0,invierno,0,...,0,0,0,0,0,0,0,0,2.616,0.0
8135287,201610,ACCESORIES_2_140_PHI_2,ACCESORIES_2_140,ACCESORIES,Yorktown,PHI_2,Philadelphia,3,invierno,0,...,0,0,0,0,0,0,0,0,3.2851,9.8553
3242988,201306,SUPERMARKET_2_074_NYC_4,SUPERMARKET_2_074,SUPERMARKET,Brooklyn,NYC_4,New York,20,invierno,0,...,0,1,0,0,0,0,0,0,3.0,60.0
1602251,201205,HOME_&_GARDEN_2_431_BOS_2,HOME_&_GARDEN_2_431,HOME_&_GARDEN,Roxbury,BOS_2,Boston,5,invierno,0,...,0,0,0,0,0,0,0,0,11.8375,59.1875
755874,201129,SUPERMARKET_3_547_PHI_1,SUPERMARKET_3_547,SUPERMARKET,Midtown_Village,PHI_1,Philadelphia,0,verano,0,...,0,0,0,0,0,0,0,0,0.24,0.0


In [159]:
cambioFormato(df_final,['yearweek','weekly_sales'],np.int32)

In [160]:
datosGenerales(df_final)

Su shape (8323770, 22)
El número de duplicados es  0


yearweek            0 nulos ________
id                  0 nulos ________
item                0 nulos ________
category            0 nulos ________
store               0 nulos ________
store_code          0 nulos ________
region              0 nulos ________
weekly_sales        0 nulos ________
estacion            0 nulos ________
week_event          0 nulos ________
Predidents_day      0 nulos ________
Ramadan_starts      0 nulos ________
San_Patricio        0 nulos ________
San_Valentin        0 nulos ________
Superbowl           0 nulos ________
Thanksgiving        0 nulos ________
Amazon_prime_day    0 nulos ________
dia_de_la_indep     0 nulos ________
dia_madre           0 nulos ________
dia_padre           0 nulos ________
sell_price          0 nulos ________
revenue             0 nulos ________
dtype: object

In [161]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8323770 entries, 0 to 8323769
Data columns (total 22 columns):
 #   Column            Dtype   
---  ------            -----   
 0   yearweek          int32   
 1   id                object  
 2   item              category
 3   category          category
 4   store             category
 5   store_code        category
 6   region            category
 7   weekly_sales      int32   
 8   estacion          object  
 9   week_event        int64   
 10  Predidents_day    uint8   
 11  Ramadan_starts    uint8   
 12  San_Patricio      uint8   
 13  San_Valentin      uint8   
 14  Superbowl         uint8   
 15  Thanksgiving      uint8   
 16  Amazon_prime_day  uint8   
 17  dia_de_la_indep   uint8   
 18  dia_madre         uint8   
 19  dia_padre         uint8   
 20  sell_price        float64 
 21  revenue           float64 
dtypes: category(5), float64(2), int32(2), int64(1), object(2), uint8(10)
memory usage: 571.6+ MB


In [173]:
df_final['sell_price'] = df_final['sell_price'].round(2)
df_final['revenue'] = df_final['revenue'].round(2)

In [174]:
pd.to_pickle(df_final,f'{DATA_PATH}/df_final20230317.pkl')

In [None]:
#df_final.to_csv(f'{DATA_PATH}/df_final.csv')

In [None]:
## df_lectura = pd.read_pickle(f'{DATA_PATH}/df_unido.pkl')