# Analysis of Chemicals in Cosmetic Products
# Forecasting
___

## 1. Load data and Creation of Validation Dataset

#### Import libraries

In [31]:
%pylab
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

import os
import sys
import numpy    as np
import pandas   as pd
import datetime

from matplotlib             import pyplot
from dateutil.relativedelta import relativedelta

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


#### Declare constants 

In [2]:
# ---------------------------------------------------------------------------
# Column names of the dataset
# ---------------------------------------------------------------------------
CDPHID_COLUMN = 'CDPHId'
PRODUCTNAME_COLUMN = 'ProductName'
CSFID_COLUMN = 'CSFId'
CSF_COLUMN = 'CSF'
COMPANYID_COLUMN = 'CompanyId'
COMPANYNAME_COLUMN = 'CompanyName'
BRANDNAME_COLUMN = 'BrandName'
PRIMARYCATEGORYID_COLUMN = 'PrimaryCategoryId'
PRIMARYCATEGORY_COLUMN = 'PrimaryCategory'
SUBCATEGORYID_COLUMN = 'SubCategoryId'
SUBCATEGORY_COLUMN = 'SubCategory'
CASID_COLUMN = 'CasId'
CASNUMBER_COLUMN = 'CasNumber'
CHEMICALID_COLUMN = 'ChemicalId'
CHEMICALNAME_COLUMN = 'ChemicalName'
INITIALDATEREPORTED_COLUMN = 'InitialDateReported'
MOSTRECENTDATEREPORTED_COLUMN = 'MostRecentDateReported'
DISCONTINUEDDATE_COLUMN = 'DiscontinuedDate'
CHEMICALCREATEDAT_COLUMN = 'ChemicalCreatedAt'
CHEMICALUPDATEDAT_COLUMN = 'ChemicalUpdatedAt'
CHEMICALDATEREMOVED_COLUMN = 'ChemicalDateRemoved'
CHEMICALCOUNT_COLUMN = 'ChemicalCount'

# ---------------------------------------------------------------------------
# Sentinel values
# ---------------------------------------------------------------------------
# Number that stands in for an empty CSFId
CSFID_EMPTY_NUMBER = -1
# Date that stands in for a missing date (NaT)
NAT_DATE = datetime.date(1900, 1, 1)

# ---------------------------------------------------------------------------
# Column groups
# ---------------------------------------------------------------------------
# Columns that hold names
list_names = [
    PRODUCTNAME_COLUMN,
    CSF_COLUMN,
    COMPANYNAME_COLUMN,
    BRANDNAME_COLUMN,
    PRIMARYCATEGORY_COLUMN,
    SUBCATEGORY_COLUMN,
    CHEMICALNAME_COLUMN,
]

# Columns that hold dates
list_dates = [
    INITIALDATEREPORTED_COLUMN,
    MOSTRECENTDATEREPORTED_COLUMN,
    DISCONTINUEDDATE_COLUMN,
    CHEMICALCREATEDAT_COLUMN,
    CHEMICALUPDATEDAT_COLUMN,
    CHEMICALDATEREMOVED_COLUMN,
]

# Columns that hold identifiers
list_ids = [
    CDPHID_COLUMN,
    CSFID_COLUMN,
    COMPANYID_COLUMN,
    PRIMARYCATEGORYID_COLUMN,
    SUBCATEGORYID_COLUMN,
    CASID_COLUMN,
    CHEMICALID_COLUMN,
]

# ---------------------------------------------------------------------------
# Files and folders
# ---------------------------------------------------------------------------
# Folder where images are saved
IMAGES_PATH = 'img'

# File extension used for saved images
IMAGES_FORMAT = '.png'

# Folder that holds the latest version of the dataset
DATA_PATH = 'data'

# Folder that holds the backup version of the dataset
DATA_BACKUP_PATH = 'data_backup'

# File name of the dataset
DATA_NAME = 'cscpopendata.csv'

---

### 1.1. Load data

#### From data/ folder 

In [36]:
# Load data from data/ folder
data = pd.read_csv(os.path.join(DATA_PATH, DATA_NAME), sep = ',', parse_dates = list_dates)

# Fill NaT with 01/01/1900.
# fillna() with a Timestamp keeps each column as datetime64; a row-wise apply()
# that inserts datetime.date objects would degrade the column to object dtype.
nat_timestamp = pd.Timestamp(NAT_DATE)
for date_column in list_dates :
    data[date_column] = data[date_column].fillna(nat_timestamp)

# Fill missing names with ''.
# Only the name columns are filled: a frame-wide fillna('') would also write
# empty strings into numeric columns with gaps (e.g. ChemicalCount, summed below).
data[list_names] = data[list_names].fillna('')

# Fill CSFId empty with -1
data[CSFID_COLUMN] = data[CSFID_COLUMN].fillna(CSFID_EMPTY_NUMBER)

# Keep only the rows whose cosmetic product has not been discontinued and whose
# chemical has not been removed, i.e. both dates carry the sentinel year.
# NOTE: .dt requires that read_csv parsed these columns as datetimes.
not_discontinued = data[DISCONTINUEDDATE_COLUMN].dt.year == NAT_DATE.year
not_removed      = data[CHEMICALDATEREMOVED_COLUMN].dt.year == NAT_DATE.year
data = data[not_discontinued & not_removed]

# Group by InitialDateReported and sum ChemicalCount
# (agg with a dict already returns a DataFrame indexed by InitialDateReported)
data = data.groupby(INITIALDATEREPORTED_COLUMN).agg({CHEMICALCOUNT_COLUMN : 'sum'})

#### From data_backup/ folder 

In [38]:
# Read the previous version of the dataset, stored in the data_backup/ folder
backup_csv_path = os.path.join(DATA_BACKUP_PATH, DATA_NAME)
data_backup = pd.read_csv(backup_csv_path, sep = ',', parse_dates = list_dates)

---

### 1.2. Creation of Validation Dataset

As the dataset is updated frequently, we are going to use the new data to create a validation dataset. We store two versions of the data:

- The last data version in `data/` folder.
- The previous data version in `data_backup/` folder.

With that, we are going to split the data as follows:

- The `dataset` variable will be the dataset with which we are going to fit the model. It will contain all data up to 5 months before the current month, where the current month is the month of the latest `InitialDateReported` in the backup data. For example, if the current month is 02/19 (Feb, 2019), the cut-off is 09/18 (Sept, 2018), so the last possible date in this dataset is 31/08/18 (Aug 31, 2018).
- The `validation` variable will be the dataset with which we are going to validate the model. It will contain the rest of the data.

Let's create both datasets:

In [42]:
# Get the max InitialDateReported from data.
# pandas' .max() skips missing values, whereas the builtin max() gives an
# order-dependent result (possibly NaT) when a NaT is among the values.
data_max = data.index.max()

# Get the max InitialDateReported from data_backup
data_backup_max = data_backup[INITIALDATEREPORTED_COLUMN].max()

print('data        | Max InitialDateReported = ', data_max)
print('data_backup | Max InitialDateReported = ', data_backup_max)

data        | Max InitialDateReported =  2019-02-21 00:00:00
data_backup | Max InitialDateReported =  2019-02-06 00:00:00


In [54]:
# First day of the month of the latest InitialDateReported in data_backup
current_month = datetime.datetime(data_backup_max.year, data_backup_max.month, 1)

# The dataset limit lies 5 months before the current month
dataset_limit = current_month - relativedelta(months=5)

print('Current month =', current_month)
print('Dataset limit =', dataset_limit)

Current month = 2019-02-01 00:00:00
Dataset limit = 2018-09-01 00:00:00


In [55]:
# Split data at dataset_limit: rows reported before the limit are used to fit
# the model, rows reported on or after it are used to validate it
report_dates = data.index
dataset = data.loc[report_dates < dataset_limit]
validation = data.loc[report_dates >= dataset_limit]

print('Len(dataset) =', len(dataset))
print('Len(validation) =', len(validation))

Len(dataset) = 1749
Len(validation) = 114


Our current daily datetime data can be tricky to work with. Therefore, we will use the average daily chemical count of each month instead, with the start of each month as the timestamp.

In [56]:
# Preview the first rows of the fitting dataset (one row per reported date)
dataset.head(n = 5)

Unnamed: 0_level_0,ChemicalCount
InitialDateReported,Unnamed: 1_level_1
2009-07-01,4
2009-07-09,1
2009-07-13,6
2009-07-14,18
2009-08-10,4


In [60]:
# Resample both datasets to month-start ('MS') frequency, taking the mean of
# the rows that fall in each month.
# NOTE(review): a month without any row yields NaN - confirm that later steps handle it.
dataset, validation = (frame.resample('MS').mean() for frame in (dataset, validation))

print('Len(dataset) =', len(dataset))
print('Len(validation) =', len(validation))
dataset.head()

Len(dataset) = 110
Len(validation) = 6


Unnamed: 0_level_0,ChemicalCount
InitialDateReported,Unnamed: 1_level_1
2009-07-01,7.25
2009-08-01,31.909091
2009-09-01,237.416667
2009-10-01,779.08
2009-11-01,46.75


---