# Analysis of Chemicals in Cosmetic Products
# Forecasting
___

## 1. Load data and aggregation 

#### Import libraries

In [1]:
%pylab
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

import os
import sys
import numpy    as np
import pandas   as pd
import datetime

# Use Plotly for statistics graphics
import plotly.plotly         as py
import plotly.graph_objs     as go
import plotly.offline        as offline
import plotly.io             as pio

# Set notebook mode = True
offline.init_notebook_mode(connected=True)

from sklearn.cluster       import KMeans
from sklearn.metrics       import silhouette_score
from sklearn.linear_model  import LinearRegression
from IPython.display       import Image

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


#### Declare constants 

In [2]:
# Variables with the names of each column in data
CDPHID_COLUMN                 = 'CDPHId'
PRODUCTNAME_COLUMN            = 'ProductName'
CSFID_COLUMN                  = 'CSFId'
CSF_COLUMN                    = 'CSF'
COMPANYID_COLUMN              = 'CompanyId'
COMPANYNAME_COLUMN            = 'CompanyName'
BRANDNAME_COLUMN              = 'BrandName'
PRIMARYCATEGORYID_COLUMN      = 'PrimaryCategoryId'
PRIMARYCATEGORY_COLUMN        = 'PrimaryCategory'
SUBCATEGORYID_COLUMN          = 'SubCategoryId'
SUBCATEGORY_COLUMN            = 'SubCategory'
CASID_COLUMN                  = 'CasId'
CASNUMBER_COLUMN              = 'CasNumber'
CHEMICALID_COLUMN             = 'ChemicalId'
CHEMICALNAME_COLUMN           = 'ChemicalName'
INITIALDATEREPORTED_COLUMN    = 'InitialDateReported'
MOSTRECENTDATEREPORTED_COLUMN = 'MostRecentDateReported'
DISCONTINUEDDATE_COLUMN       = 'DiscontinuedDate'
CHEMICALCREATEDAT_COLUMN      = 'ChemicalCreatedAt'
CHEMICALUPDATEDAT_COLUMN      = 'ChemicalUpdatedAt'
CHEMICALDATEREMOVED_COLUMN    = 'ChemicalDateRemoved'
CHEMICALCOUNT_COLUMN          = 'ChemicalCount'

# Number to replace empty value in CSFId column
CSFID_EMPTY_NUMBER  = -1
# Date value which means NaT 
NAT_DATE            = datetime.date(1900,1,1)

# List of columns which contains a Names
list_names = [ PRODUCTNAME_COLUMN, CSF_COLUMN, COMPANYNAME_COLUMN, BRANDNAME_COLUMN, 
               PRIMARYCATEGORY_COLUMN, SUBCATEGORY_COLUMN, CHEMICALNAME_COLUMN ]

# List of columns which contains a Dates
list_dates = [ INITIALDATEREPORTED_COLUMN, MOSTRECENTDATEREPORTED_COLUMN, DISCONTINUEDDATE_COLUMN,
               CHEMICALCREATEDAT_COLUMN, CHEMICALUPDATEDAT_COLUMN, CHEMICALDATEREMOVED_COLUMN ]

# Other columns
list_ids   = [ CDPHID_COLUMN, CSFID_COLUMN, COMPANYID_COLUMN, PRIMARYCATEGORYID_COLUMN, 
               SUBCATEGORYID_COLUMN, CASID_COLUMN, CHEMICALID_COLUMN ]

# Path to save the images
IMAGES_PATH   = 'img'

# Format of images
IMAGES_FORMAT = '.png'

# Function to get completed path to an image
def get_path(name) :
    global IMAGES_PATH, IMAGES_FORMAT
    
    return os.path.join(IMAGES_PATH, name + IMAGES_FORMAT)

---

### 1.1. Load data

In [36]:
# Load data from data/ folder
data = pd.read_csv('data/cscpopendata.csv', sep = ',', parse_dates = list_dates)

# pd.isna() --> for NaN
# <value> is pd.NaT --> for NaT

# Fill NaT with 01/01/1900
for date_column in list_dates :
    data[date_column] = data[date_column].apply(lambda x: NAT_DATE if x is pd.NaT else x)
    

# Fill NA with ''
data = data.fillna('')

# Fill CSFId empty with -1
data[CSFID_COLUMN] = data[CSFID_COLUMN].apply(lambda x: x if x != "" else CSFID_EMPTY_NUMBER)

data.head()

Unnamed: 0,CDPHId,ProductName,CSFId,CSF,CompanyId,CompanyName,BrandName,PrimaryCategoryId,PrimaryCategory,SubCategoryId,...,CasNumber,ChemicalId,ChemicalName,InitialDateReported,MostRecentDateReported,DiscontinuedDate,ChemicalCreatedAt,ChemicalUpdatedAt,ChemicalDateRemoved,ChemicalCount
0,2,ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES,-1.0,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),53,...,13463-67-7,6,Titanium dioxide,2009-06-17,2013-08-28,2011-02-01,2009-07-09,2009-07-09,1900-01-01,1
1,3,Glover's Medicated Shampoo,-1.0,,338,J. Strickland & Co.,Glover's,18,Hair Care Products (non-coloring),25,...,65996-92-1,4,Distillates (coal tar),2009-07-01,2009-07-01,1900-01-01,2009-07-01,2009-07-01,1900-01-01,2
2,3,Glover's Medicated Shampoo,-1.0,,338,J. Strickland & Co.,Glover's,18,Hair Care Products (non-coloring),25,...,140-67-0,5,Estragole,2009-07-01,2009-07-01,1900-01-01,2009-07-02,2009-07-02,1900-01-01,2
3,4,PRECISION GLIMMER EYE LINER-ALL SHADES �,-1.0,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),46,...,13463-67-7,7,Titanium dioxide,2009-07-09,2013-08-28,1900-01-01,2009-07-09,2009-07-09,1900-01-01,1
4,5,AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �,-1.0,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),52,...,13463-67-7,8,Titanium dioxide,2009-07-09,2013-08-28,2011-02-01,2009-07-09,2009-07-09,1900-01-01,1


---

### 1.2. Data aggregation 

In [12]:
# Remove all rows that have a cosmetic product discontinued
to_aggregate = data[data[DISCONTINUEDDATE_COLUMN].apply(lambda x: x.year) == NAT_DATE.year]

# Remove all rows that have a chemical removed
to_aggregate = to_aggregate[to_aggregate[CHEMICALDATEREMOVED_COLUMN].apply(lambda x: x.year) == NAT_DATE.year]

In [13]:
# Aggregate InitialDateReported
data_initialdate = pd.DataFrame(to_aggregate.groupby(INITIALDATEREPORTED_COLUMN, as_index = False)
                                               .agg({CHEMICALCOUNT_COLUMN : 'sum'}))
data_initialdate.head()

Unnamed: 0,InitialDateReported,ChemicalCount
0,2009-07-01,4
1,2009-07-09,1
2,2009-07-13,6
3,2009-07-14,18
4,2009-08-10,4


In [14]:
# Aggregate MostRecentDateReported
data_mostrecentdate = pd.DataFrame(to_aggregate.groupby(MOSTRECENTDATEREPORTED_COLUMN, as_index = False)
                                               .agg({CHEMICALCOUNT_COLUMN : 'sum'}))
data_mostrecentdate.head()

Unnamed: 0,MostRecentDateReported,ChemicalCount
0,2009-07-01,4
1,2009-08-11,1
2,2009-08-18,1
3,2009-08-21,6
4,2009-08-24,1


In [15]:
# Aggregate ChemicalUpdatedAt
data_updatedat = pd.DataFrame(to_aggregate.groupby(CHEMICALUPDATEDAT_COLUMN, as_index = False)
                                               .agg({CHEMICALCOUNT_COLUMN : 'sum'}))
data_updatedat.head()

Unnamed: 0,ChemicalUpdatedAt,ChemicalCount
0,2009-07-01,2
1,2009-07-02,2
2,2009-07-09,1
3,2009-08-10,4
4,2009-08-16,8


736330

In [45]:
y = pd.DataFrame(to_aggregate.groupby(INITIALDATEREPORTED_COLUMN)
                                               .agg({CHEMICALCOUNT_COLUMN : 'sum'}))

In [59]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from random import random
# contrived dataset
data = y
# fit model
model = SARIMAX(data, order=(1, 1, 1), seasonal_order=(1, 1, 1, 1))
model_fit = model.fit(disp=False)
# make prediction
yhat = model_fit.predict()
print(yhat)


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



InitialDateReported
2009-07-01      0.000000
2009-07-09      6.004995
2009-07-13     -1.746975
2009-07-14      7.385601
2009-08-10     22.754874
2009-08-11      2.714339
2009-08-16      1.450073
2009-08-18      9.438886
2009-08-21      0.586820
2009-08-24      6.706012
2009-08-25     12.089746
2009-08-26    145.295817
2009-08-27     29.103116
2009-08-28     72.355696
2009-08-31     33.871999
2009-09-01     50.818523
2009-09-02     67.122079
2009-09-03     85.214087
2009-09-04     69.737046
2009-09-07     50.736538
2009-09-08    540.300106
2009-09-09     35.144306
2009-09-10    121.284828
2009-09-11    121.939234
2009-09-14    725.601047
2009-09-15    139.470189
2009-09-16    255.308541
2009-09-17    420.721316
2009-09-18    171.359608
2009-09-19    133.392657
                 ...    
2018-12-29     58.003863
2018-12-30     18.511220
2018-12-31     10.618548
2019-01-01     87.090655
2019-01-02     -1.092168
2019-01-03     11.354449
2019-01-04    179.692841
2019-01-06     52.619184
2019-

In [50]:
len(data)

1855

In [None]:
# Let's plot a histogram of CASID_COLUMN
data_plot = [
    go.Scatter(
        x = data_initialdate[INITIALDATEREPORTED_COLUMN].apply(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d')).values,
        y = data_initialdate[CHEMICALCOUNT_COLUMN].values,
        name = 'Initial Date Reported'
    )#,
#    go.Scatter(
#        x = data_mostrecentdate[MOSTRECENTDATEREPORTED_COLUMN].apply(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d')).values,
#        y = data_mostrecentdate[CHEMICALCOUNT_COLUMN].values,
#        name = 'Most Recent Date Reported'
#    ),
#    go.Scatter(
#        x = data_updatedat[CHEMICALUPDATEDAT_COLUMN].apply(lambda x: datetime.datetime.strftime(x, '%Y-%m-%d')).values,
#        y = data_updatedat[CHEMICALCOUNT_COLUMN].values,
#        name = 'Chemical Updated At'
#    )
]

layout = go.Layout(
    title='Chemical Count in Cosmetic Products',
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Chemical Count'
    ),
    showlegend = True,
    plot_bgcolor='rgba(240,240,240, 0.95)'
)

fig = go.Figure(data=data_plot, layout=layout)
offline.iplot(fig)
#pio.write_image(fig, get_path('histogram_casid_basic'))
#Image(get_path('histogram_casid_basic'))