## Overview
### Data
I am using the Kaggle dataset from the [H&M Personalized Fashion Recommendations competition](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations). It contains:

- articles.csv: which contains information about fashion items.
- customers.csv: which contains information about users.
- transactions_train.csv: which contains information about transactions.

### Preprocessing
1. Removing nulls
2. Converting variables to their appropriate types
3. Grouping ages
4. Imputing nulls (if possible)

In [None]:
import pandas as pd
import numpy as np

import great_expectations as gx
from great_expectations import ExpectationSuite
import preprocessing

ModuleNotFoundError: No module named 'great_expectations.dataset'

## Articles

In [6]:
# Load the raw articles table and preview its first two rows.
articles_data_df = pd.read_csv(
    "data/articles.csv",
    encoding="utf-8",
)
articles_data_df.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [7]:
# Dimensions of the raw articles frame as (rows, columns), before preprocessing.
articles_data_df.shape

(105542, 25)

In [8]:
# Run the project's article preprocessing on the raw frame; the result is kept
# under a separate name so the raw frame stays available.
# NOTE(review): the recorded shapes go from 25 to 24 columns with the row count
# unchanged — confirm which column preprocess_articles drops.
articles_df = preprocessing.preprocess_articles(articles_data_df)

In [9]:
# Show the preprocessed articles dimensions, then preview the first two rows
# (the trailing expression is rendered as the cell output).
print(articles_df.shape)
articles_df.head(2)

(105542, 24)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic


## Customers

In [10]:
# Load the raw customers table and preview its first two rows.
customers_data_df = pd.read_csv(
    "data/customers.csv",
    encoding="utf-8",
)
customers_data_df.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


In [11]:
# Dimensions of the raw customers frame as (rows, columns), before preprocessing.
customers_data_df.shape

(1371980, 7)

In [12]:
# Run the project's customer preprocessing on the raw frame.
# NOTE(review): the recorded outputs show fewer rows afterwards (1371980 -> 1356119),
# FN / Active / fashion_news_frequency no longer present, and a new age_group
# column — confirm against preprocess_customers.
customers_df = preprocessing.preprocess_customers(customers_data_df)

In [13]:
# Show the preprocessed customers dimensions, then preview the first two rows
# (the trailing expression is rendered as the cell output).
print(customers_df.shape)
customers_df.head(2)

(1356119, 5)


Unnamed: 0,customer_id,club_member_status,age,postal_code,age_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,ACTIVE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,46-55
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,ACTIVE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,19-25


## Transactions

In [14]:
# Load the raw transactions table and preview its first two rows.
transaction_data_df = pd.read_csv(
    "data/transactions_train.csv",
    encoding="utf-8",
)
transaction_data_df.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2


In [15]:
# Dimensions of the raw transactions frame as (rows, columns), before preprocessing.
transaction_data_df.shape

(31788324, 5)

In [26]:
%load_ext autoreload
%autoreload 2
import preprocessing


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Run the project's transaction preprocessing on the raw frame.
# NOTE(review): the recorded outputs show the same row count and six extra
# columns (year, month, day, day_of_week, month_sin, month_cos) — confirm
# against preprocess_transactions.
transaction_df = preprocessing.preprocess_transactions(transaction_data_df)

In [28]:
# Show the preprocessed transactions dimensions, then preview the first two rows
# (the trailing expression is rendered as the cell output).
print(transaction_df.shape)
transaction_df.head(2)

(31788324, 11)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week,month_sin,month_cos
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,2018,9,20,3,-0.866025,-0.5
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,2018,9,20,3,-0.866025,-0.5


## Great Expectations

In [None]:
# Create the Great Expectations context, register a pandas data source on it,
# and declare a dataframe asset that batches of customers data attach to.
context = gx.get_context()
# The source name is assigned once; a previous "dataframe_source" value was
# overwritten on the very next line and never used, so it has been dropped.
gx_source_name = "pd customers asset"
data_source = context.data_sources.add_pandas(gx_source_name)
data_asset = data_source.add_dataframe_asset(name="customer_df")

In [48]:
# Start an empty expectation suite that will collect the customers checks.
# `ExpectationSuite` is the class imported from great_expectations in the
# imports cell (the same object as `gx.ExpectationSuite`).
suite_name = "customers_suite"
customers_suite = ExpectationSuite(name=suite_name)

In [47]:
# Define a whole-dataframe batch on the customers asset, then materialise it
# from the preprocessed customers frame.
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "customers batch"
)
customers_batch_parameters = {"dataframe": customers_df}
batch = batch_definition.get_batch(batch_parameters=customers_batch_parameters)

In [None]:
# Register the suite with the context and rebind the name to the returned suite.
# NOTE(review): the recorded AttributeError says a 'DataFrameAsset' has no
# attribute 'suites', i.e. `context` was bound to a data asset when this cell
# last ran, while the setup cell binds it to gx.get_context(). The output looks
# stale — re-run from a fresh kernel to confirm.
customers_suite = context.suites.add(customers_suite)

AttributeError: 'DataFrameAsset' object has no attribute 'suites'

In [60]:
# Ages must fall within the inclusive range 1..101.
age_range_expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="age",
    min_value=1,
    max_value=101,
)
customers_suite.add_expectation(age_range_expectation)

ExpectColumnValuesToBeBetween(id='2a6b3acd-a0b5-4acd-abaf-12f35e78530c', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='age', mostly=1, row_condition=None, condition_parser=None, min_value=1.0, max_value=101.0, strict_min=False, strict_max=False)

In [56]:
# Require every column of the preprocessed customers frame to be fully populated.
# The previous check used ExpectColumnValuesToBeNull with mostly=0, which only
# demands that at least 0% of the values are null and therefore passes for any
# data; ExpectColumnValuesToNotBeNull actually fails when a null is present.
for column in customers_df.columns:
    customers_suite.add_expectation(
        gx.expectations.ExpectColumnValuesToNotBeNull(column=column)
    )

In [None]:
# Ported from the legacy ExpectationConfiguration-based API: the original cell
# referenced `customers_expectation_suite`, `ExpectationConfiguration` and
# `gx_customers_df`, none of which are defined or imported in this notebook, so
# it raised NameError. It now uses `customers_suite` / `customers_df` and the
# expectation classes under `gx.expectations`.
# NOTE(review): this cell overlaps with the expectation cells above it (with a
# wider 0..120 age range) — keep only one set once the intended bounds are settled.
customers_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="age",
        min_value=0,
        max_value=120,
    )
)

# "to be null" with mostly=0.0 is satisfied by any data, so the no-nulls intent
# is expressed directly with ExpectColumnValuesToNotBeNull.
for column in customers_df.columns:
    customers_suite.add_expectation(
        gx.expectations.ExpectColumnValuesToNotBeNull(column=column)
    )